{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Highlights of the Code\n",
    "\n",
    "1. **Replication of Table E**  \n",
    "   This code replicates Table E from the Internet Appendix of the paper.\n",
    "\n",
    "<br>\n",
    "\n",
    "2. **Forecasting Risk Premiums using the Lewellen model**  \n",
    "   The code demonstrates how to forecast risk premiums using the Lewellen Model \n",
    "   and generates the corresponding forecast confidence intervals.\n",
    "\n",
    "<br>\n",
    "\n",
    "3. **175 Stock-Level Predictors as Inputs**  \n",
    "   It utilizes 14 stock-level predictors to produce Lewellen-based risk premium forecasts  \n",
    "   along with their forecast standard errors.\n",
    "\n",
    "<br>\n",
    "\n",
    "4. **Model Retraining Schedule**  \n",
    "   - Models are retrained at the start of each year.  \n",
    "   - With out-of-sample data spanning 1987 to 2020, the models are retrained 34 times.  \n",
    "   - **Initial training sample**: 1957 to 1974.  \n",
    "   - The training sample expands recursively each year.  \n",
    "   - **Initial validation sample**: 1975 to 1986.  \n",
    "   - The validation sample advances forward annually.\n",
    "\n",
    "\n",
    "<br>\n",
    "\n",
    "5. **Copyright Protection**  \n",
    "   The code is intended solely for research and non-commercial purposes.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np # linear algebra\n",
    "import multiprocessing as mp\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import seaborn as sns\n",
    "import numpy\n",
    "import pandas\n",
    "from joblib import Parallel, delayed\n",
    "import keras  \n",
    "from keras import regularizers\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.wrappers.scikit_learn import KerasRegressor\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import Pipeline\n",
    "from keras.layers import BatchNormalization\n",
    "from keras.layers import Activation\n",
    "from keras import optimizers\n",
    "from sklearn.metrics import r2_score\n",
    "from keras import callbacks\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras.layers import Dropout\n",
    "from keras.models import load_model\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.linear_model import ElasticNet\n",
    "from numpy.linalg import inv\n",
    "from scipy import stats\n",
    "import scipy\n",
    "from scipy import stats\n",
    "from scipy.optimize import minimize\n",
    "from scipy.stats import invgauss\n",
    "from sklearn import linear_model\n",
    "#from scipy.stats import invgauss\n",
    "import scipy.stats as sc\n",
    "from numpy.linalg import matrix_rank\n",
    "from sklearn.linear_model import LinearRegression\n",
    "#import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import multiprocessing as mp\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import seaborn as sns\n",
    "import numpy\n",
    "import pandas\n",
    "from joblib import Parallel, delayed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import keras  \n",
    "from keras import regularizers\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.wrappers.scikit_learn import KerasRegressor\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import Pipeline\n",
    "from keras.layers import BatchNormalization\n",
    "from keras.layers import Activation\n",
    "from keras import optimizers\n",
    "from sklearn.metrics import r2_score\n",
    "from keras import callbacks\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras.layers import Dropout\n",
    "from keras.models import load_model\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Gibbs_sampler_lambda_wls(X, Y_tilde, xerror, errorbetain,h1, h2, n):\n",
    "    \n",
    "    beta=np.zeros((n,X.shape[1]))\n",
    "    errorbeta=np.zeros((n,xerror.shape[1]))\n",
    "    #tau_sq=np.zeros((n,X.shape[1]))\n",
    "    sigma_sq=np.zeros((n,))\n",
    "    #lambda_sq=np.zeros((n,))\n",
    "    # Initialization\n",
    "    errorbeta[0]=errorbetain\n",
    "    beta[0] = np.random.uniform(size = X.shape[1])\n",
    "    sigma_sq[0] = np.random.uniform()\n",
    "    Aerror= xerror.transpose().dot(xerror)\n",
    "    Aerrorinv=np.linalg.inv(Aerror)\n",
    "    for i in range(n-1):\n",
    "        # Full conditional for beta\n",
    "        Y_error=Y_tilde-xerror.dot(errorbeta[i])\n",
    "        A = X.transpose().dot(X) #+ D_tau_inv\n",
    "        Ainv=np.linalg.inv(A)\n",
    "        multi_norm_mean = Ainv.dot(X.transpose()).dot(Y_error)\n",
    "        multi_norm_cov = sigma_sq[i] * Ainv\n",
    "        beta[i+1]=(np.random.multivariate_normal(multi_norm_mean, multi_norm_cov))\n",
    "        # Full conditional for sigma_sq\n",
    "        shape = (X.shape[0]-1+X.shape[1])\n",
    "        scale = ((Y_error - X.dot(beta[i+1])).dot((Y_error - X.dot(beta[i+1]))) )#+ beta[i+1].transpose().dot(D_tau_inv).dot(beta[i+1]))/2\n",
    "        sigma_sq[i+1]=(sc.invgamma.rvs(a = shape, scale = scale))\n",
    "         \n",
    "        errortrain=Y_tilde-X.dot(beta[i+1])\n",
    "        xyerror=xerror.transpose().dot(errortrain)\n",
    "        errorbeta_mean=Aerrorinv.dot(xyerror)\n",
    "        errorvar=np.var(Y_tilde-X.dot(beta[i+1])-xerror.dot(errorbeta[i]))\n",
    "        errorbeta[i+1]=np.random.multivariate_normal(h1*errorbeta_mean, h2*Aerrorinv*errorvar)\n",
    "    return beta[int(n/2):]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rsq_oos(y,ypred): \n",
    "    SSR = np.sum((y-ypred)**2)\n",
    "    MSS = np.sum((y)**2)\n",
    "    r_squared =1-SSR/MSS \n",
    "    return r_squared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import linear_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def signals_and_confidence_levels():\n",
    "    \n",
    "    for M in range(1,31):\n",
    "        np.random.seed(0)\n",
    "        gg1=pd.read_csv('twdata{}.csv'.format(M))\n",
    "        gg2=pd.read_csv('vdata{}.csv'.format(M))\n",
    "        gg3=pd.read_csv('tedata{}.csv'.format(M))\n",
    "\n",
    "        gg1['retmkt']=gg1['retmkt']-gg1['retmkt'].mean()\n",
    "        gg1['mktsize']=gg1['retmkt']*gg1['mvel1']\n",
    "        gg1['mktbm']=gg1['retmkt']*gg1['bm']\n",
    "        gg1['mktmom']=gg1['retmkt']*gg1['mom12m']\n",
    "        \n",
    "        pp=(gg1.columns.values)\n",
    "        gg1[\"yyyymm\"]= gg1[\"yyyymm\"].astype(str) \n",
    "        gg2[\"yyyymm\"]= gg2[\"yyyymm\"].astype(str) \n",
    "        gg3[\"yyyymm\"]= gg3[\"yyyymm\"].astype(str) \n",
    "        gg1=gg1[['yyyymm', 'PERMNO', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp','retmkt', 'mktsize', 'mktbm', 'mktmom']]\n",
    "        gg2=gg2[['yyyymm', 'PERMNO', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp']]\n",
    "        gg3=gg3[['yyyymm', 'PERMNO', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp']]\n",
    "        \n",
    "        xtrain=((gg1.iloc[:,3:17]))\n",
    "        xtest=((gg2.iloc[:,3:17]))\n",
    "        xoos=(gg3.iloc[:,3:17])\n",
    "        xerror=gg1[['retmkt','mktsize','mktbm','mktmom']]\n",
    "        xerror=xerror-np.mean(xerror)\n",
    "        ytrain=(gg1.iloc[:,2])\n",
    "\n",
    "        ytest=gg2.iloc[:,2]\n",
    "        yoos=gg3.iloc[:,2]\n",
    "\n",
    "        ytraind=(ytrain-np.mean(ytrain,axis=0))\n",
    "        ytestd=(ytest-np.mean(ytrain,axis=0))\n",
    "\n",
    "        xtraind=(xtrain-np.mean(xtrain,axis=0))\n",
    "        xtestd=(xtest-np.mean(xtrain,axis=0))\n",
    "        xoosd=(xoos-np.mean(xtrain,axis=0))\n",
    "        \n",
    "        ##Fitting a penalized linaer model just for comparison sake \n",
    "        lasso=linear_model.Lasso(alpha=0.0001, fit_intercept=False)\n",
    "        #linear_model.Lasso(alpha=0.01, fit_intercept=False)\n",
    "        lasso.fit(xtraind, ytraind)\n",
    "\n",
    "        # The estimator chose automatically its lambda:\n",
    "        lasso_cv_coef = lasso.coef_\n",
    "        ytrainpred=lasso.predict(xtraind)+np.mean(ytrain)\n",
    "        ytestpred=lasso.predict(xtestd)+np.mean(ytrain)\n",
    "\n",
    "        yoospred=xoosd.dot(lasso.coef_)+np.mean(ytrain)\n",
    "        errortrain=ytrain-ytrainpred\n",
    "\n",
    "        reg = LinearRegression().fit(xerror, errortrain)\n",
    "        errorfit=reg.predict(xerror)\n",
    "        errortrain=ytrain-ytrainpred\n",
    "\n",
    "        xyerror=xerror.transpose().dot(errortrain)\n",
    "        errorvar=np.var(errortrain-errorfit)\n",
    "        Aerror= xerror.transpose().dot(xerror)\n",
    "        Aerrorinv=np.linalg.inv(Aerror)\n",
    "\n",
    "\n",
    "        errorbetain=0*Aerrorinv.dot(xyerror) ##Factor loading coefficients are set to 0\n",
    "        \n",
    "        yoospred=xoosd.dot(lasso.coef_)+np.mean(ytrain)\n",
    "        beta_gibbs=Gibbs_sampler_lambda_wls(xtraind, ytraind, xerror, errorbetain, 1, 1, 100)\n",
    "        beta_bayes=np.median(beta_gibbs,axis=0)\n",
    "        yoosfreq=xoosd.dot(lasso_cv_coef)+np.mean(ytrain)\n",
    "        yoosbayes=xoosd.dot(beta_bayes)+np.mean(ytrain)\n",
    "        yoos_gibbs=np.zeros((50,yoos.shape[0]))\n",
    "\n",
    "        for i in range(50):\n",
    "\n",
    "            yoos_gibbs[i]=xoosd.dot(beta_gibbs[i])+np.mean(ytrain)\n",
    "        \n",
    "        yoosconf=np.std(yoos_gibbs,axis=0)\n",
    "        ybayesconf=pd.DataFrame(yoosconf, columns = ['ybayesconf'])\n",
    "\n",
    "        date_and_permno = gg3.iloc[0:, [0,1]]\n",
    "        yoosc=gg3.iloc[0:,2:3]\n",
    "        yoosfreqd=yoosfreq.to_frame(name=\"yoosfreq\")\n",
    "        yoosbayesd=yoosbayes.to_frame(name=\"yoosbayesd\")\n",
    "  \n",
    "\n",
    "\n",
    "        mergeout = np.concatenate((date_and_permno,yoosfreqd,yoosbayesd,ybayesconf,yoosc,gg3.iloc[0:, 3:4]),axis=1)\n",
    "        mergeout = pd.DataFrame(mergeout)\n",
    "        mergeout.to_csv('D:/bayesian jmp/lasso_output_rr/lewrollw{}.csv'.format(M), index=False, header=False)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "signals_and_confidence_levels()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **More Recent Sample**\n",
    "\n",
    "In the most recent sample uploaded by GKX, covering the years 2017–2020:\n",
    "\n",
    "1. The **price delay variable** is not available.  \n",
    "2. The **order of the first few columns** differs from the earlier sample.  \n",
    "3. Some **macroeconomic variables are not expressed in the same (log) scale** as in the initial sample (1957–2016) analyzed in the paper.  \n",
    "\n",
    "So, adjust the earlier code accordingly to **account for these differences**.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def signals_and_confidence_levels_later_sample():\n",
    "    \n",
    "    for M in range(31,35):\n",
    "        np.random.seed(0)\n",
    "        gg1=pd.read_csv('twdata{}.csv'.format(M))\n",
    "        gg2=pd.read_csv('vdata{}.csv'.format(M))\n",
    "        gg3=pd.read_csv('tedata{}.csv'.format(M))\n",
    "\n",
    "        gg1['retmkt']=gg1['retmkt']-gg1['retmkt'].mean()\n",
    "        gg1['mktsize']=gg1['retmkt']*gg1['mvel1']\n",
    "        gg1['mktbm']=gg1['retmkt']*gg1['bm']\n",
    "        gg1['mktmom']=gg1['retmkt']*gg1['mom12m']\n",
    "        \n",
    "        pp=(gg1.columns.values)\n",
    "        gg1[\"yyyymm\"]= gg1[\"yyyymm\"].astype(str) \n",
    "        gg2[\"yyyymm\"]= gg2[\"yyyymm\"].astype(str) \n",
    "        gg3[\"yyyymm\"]= gg3[\"yyyymm\"].astype(str) \n",
    "        gg1=gg1[['yyyymm', 'permno', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp','retmkt', 'mktsize', 'mktbm', 'mktmom']]\n",
    "        gg2=gg2[['yyyymm', 'permno', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp']]\n",
    "        gg3=gg3[['yyyymm', 'permno', 'retadj','mvel1', 'bm', 'mom12m', 'acc', 'roaq', 'agr', 'egr', 'dy', 'mom36m', 'beta', 'idiovol', 'turn', 'lev', 'sp']]\n",
    "        \n",
    "        xtrain=((gg1.iloc[:,3:17]))\n",
    "        xtest=((gg2.iloc[:,3:17]))\n",
    "        xoos=(gg3.iloc[:,3:17])\n",
    "        xerror=gg1[['retmkt','mktsize','mktbm','mktmom']]\n",
    "        xerror=xerror-np.mean(xerror)\n",
    "        ytrain=(gg1.iloc[:,2])\n",
    "\n",
    "        ytest=gg2.iloc[:,2]\n",
    "        yoos=gg3.iloc[:,2]\n",
    "\n",
    "        ytraind=(ytrain-np.mean(ytrain,axis=0))\n",
    "        ytestd=(ytest-np.mean(ytrain,axis=0))\n",
    "\n",
    "        xtraind=(xtrain-np.mean(xtrain,axis=0))\n",
    "        xtestd=(xtest-np.mean(xtrain,axis=0))\n",
    "        xoosd=(xoos-np.mean(xtrain,axis=0))\n",
    "        \n",
    "        ##Fitting a frequentist Linear model just for comparison sake.\n",
    "        lasso=linear_model.Lasso(alpha=0.00000000001, fit_intercept=False)\n",
    "        lasso.fit(xtraind, ytraind)\n",
    "\n",
    "        lasso_cv_coef = lasso.coef_\n",
    "        ytrainpred=lasso.predict(xtraind)+np.mean(ytrain)\n",
    "        ytestpred=lasso.predict(xtestd)+np.mean(ytrain)\n",
    "\n",
    "        yoospred=xoosd.dot(lasso.coef_)+np.mean(ytrain)\n",
    "        errortrain=ytrain-ytrainpred\n",
    "\n",
    "        reg = LinearRegression().fit(xerror, errortrain)\n",
    "        errorfit=reg.predict(xerror)\n",
    "        errortrain=ytrain-ytrainpred\n",
    "\n",
    "        xyerror=xerror.transpose().dot(errortrain)\n",
    "        errorvar=np.var(errortrain-errorfit)\n",
    "        Aerror= xerror.transpose().dot(xerror)\n",
    "        Aerrorinv=np.linalg.inv(Aerror)\n",
    "\n",
    "\n",
    "        errorbetain=0*Aerrorinv.dot(xyerror) ##Setting factor loading coeffiicents to 0\n",
    "        yoospred=xoosd.dot(lasso.coef_)+np.mean(ytrain)\n",
    "        beta_gibbs=Gibbs_sampler_lambda_wls(xtraind, ytraind, xerror, errorbetain, 1, 1, 100)\n",
    "        beta_bayes=np.median(beta_gibbs,axis=0)\n",
    "        yoosfreq=xoosd.dot(lasso_cv_coef)+np.mean(ytrain)\n",
    "        yoosbayes=xoosd.dot(beta_bayes)+np.mean(ytrain)\n",
    "        yoos_gibbs=np.zeros((50,yoos.shape[0]))\n",
    "\n",
    "        for i in range(50):\n",
    "\n",
    "            yoos_gibbs[i]=xoosd.dot(beta_gibbs[i])+np.mean(ytrain)\n",
    "        \n",
    "        yoosconf=np.std(yoos_gibbs,axis=0)\n",
    "        ybayesconf=pd.DataFrame(yoosconf, columns = ['ybayesconf'])\n",
    "\n",
    "        date_and_permno = gg3.iloc[0:, [0,1]]\n",
    "        yoosc=gg3.iloc[0:,2:3]\n",
    "        yoosfreqd=yoosfreq.to_frame(name=\"yoosfreq\")\n",
    "        yoosbayesd=yoosbayes.to_frame(name=\"yoosbayesd\")\n",
    "  \n",
    "\n",
    "\n",
    "        mergeout = np.concatenate((date_and_permno,yoosfreqd,yoosbayesd,ybayesconf,yoosc,gg3.iloc[0:, 3:4]),axis=1)\n",
    "        mergeout = pd.DataFrame(mergeout)\n",
    "        mergeout.to_csv('D:/bayesian jmp/lasso_output_rr/lewrollw{}.csv'.format(M), index=False, header=False)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "signals_and_confidence_levels_later_sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
